{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "fbcf8801-018a-4f78-b0ea-cb83e4660e96",
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.sql import SparkSession\n",
    "\n",
    "# Create a SparkSession\n",
    "spark = SparkSession.builder \\\n",
    "    .appName(\"Create-DataFrame\") \\\n",
    "    .master(\"spark://spark-master:7077\") \\\n",
    "    .getOrCreate()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2d12d97d-4961-4b5e-b571-8df65be48d94",
   "metadata": {},
   "source": [
    "### Read CSV file into DataFrame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "b8446479-fc5d-4422-9a1e-129ece15f941",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "id,name,category,quantity,price\n",
      "1,iPhone 12,Electronics,10,899.99\n",
      "2,Nike Air Max 90,Clothing,25,119.99\n",
      "3,KitchenAid Stand Mixer,Home Appliances,5,299.99\n",
      "4,The Great Gatsby,Books,50,12.99\n",
      "5,L'Oreal Paris Mascara,Beauty,100,9.99\n",
      "6,Yoga Mat,Sports,30,29.99\n",
      "7,Samsung 4K Smart TV,Electronics,8,799.99\n",
      "8,Levi's Jeans,Clothing,15,49.99\n",
      "9,Dyson Vacuum Cleaner,Home Appliances,3,399.99\n"
     ]
    }
   ],
   "source": [
    "%%bash \n",
    "head -10 ../input_files/products.csv"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "65f9db29-2efb-4874-a512-9add235d292d",
   "metadata": {},
   "source": [
    "#### Read CSV with header"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "7580ed5e-8cac-4af5-8955-1eea0ffe4c0a",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                                                \r"
     ]
    }
   ],
   "source": [
    "# Read CSV file into DataFrame\n",
    "csv_file_path = \"hdfs://namenode:9000/input_files/products.csv\"\n",
    "df = spark.read.csv(csv_file_path, header=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "e595ec04-655d-4caa-91ec-d8e298c3183b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- id: string (nullable = true)\n",
      " |-- name: string (nullable = true)\n",
      " |-- category: string (nullable = true)\n",
      " |-- quantity: string (nullable = true)\n",
      " |-- price: string (nullable = true)\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---+--------------------+---------------+--------+------+\n",
      "| id|                name|       category|quantity| price|\n",
      "+---+--------------------+---------------+--------+------+\n",
      "|  1|           iPhone 12|    Electronics|      10|899.99|\n",
      "|  2|     Nike Air Max 90|       Clothing|      25|119.99|\n",
      "|  3|KitchenAid Stand ...|Home Appliances|       5|299.99|\n",
      "|  4|    The Great Gatsby|          Books|      50| 12.99|\n",
      "|  5|L'Oreal Paris Mas...|         Beauty|     100|  9.99|\n",
      "+---+--------------------+---------------+--------+------+\n",
      "only showing top 5 rows\n",
      "\n"
     ]
     }
   ],
   "source": [
    "# Display schema of DataFrame\n",
    "df.printSchema()\n",
    "\n",
    "# Display content of DataFrame\n",
    "df.show(5)"
   ]
  },
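  {
   "cell_type": "markdown",
   "id": "7e1a2f3b-9c4d-4a5e-8b6f-0d1c2e3f4a5b",
   "metadata": {},
   "source": [
    "Without a schema or `inferSchema`, every column is read as a string. A minimal sketch of casting the numeric columns by hand (column names taken from the file above):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8f2b3c4d-0a1e-4b6f-9c7a-1e2d3f4a5b6c",
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.sql.functions import col\n",
    "\n",
    "# Cast the string columns to numeric types manually\n",
    "df_typed = (df\n",
    "    .withColumn(\"id\", col(\"id\").cast(\"int\"))\n",
    "    .withColumn(\"quantity\", col(\"quantity\").cast(\"int\"))\n",
    "    .withColumn(\"price\", col(\"price\").cast(\"double\")))\n",
    "\n",
    "df_typed.printSchema()"
   ]
  },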
  {
   "cell_type": "markdown",
   "id": "1f43038c-30e3-45fc-b927-f1c62c1cdf84",
   "metadata": {},
   "source": [
    "#### Read CSV with an explicit schema definition"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "6a105df6-1196-4a1c-95df-a54fd699986d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# import necessary types\n",
    "from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "cecfacfc-5488-40a8-b26e-fefc07c61d88",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define the schema\n",
    "schema = StructType([\n",
    "    StructField(name=\"id\", dataType=IntegerType(), nullable=True),\n",
    "    StructField(name=\"name\", dataType=StringType(), nullable=True),\n",
    "    StructField(name=\"category\", dataType=StringType(), nullable=True),\n",
    "    StructField(name=\"quantity\", dataType=IntegerType(), nullable=True),\n",
    "    StructField(name=\"price\", dataType=DoubleType(), nullable=True)\n",
    "])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "268f68db-1f89-4b94-979b-da0aa16b990f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read CSV file into DataFrame with schema definition\n",
    "csv_file_path = \"hdfs://namenode:9000/input_files/products.csv\"\n",
    "df = spark.read.csv(csv_file_path, header=True, schema=schema)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "d1ffe583-5460-484c-b136-ee4715f4b0d4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- id: integer (nullable = true)\n",
      " |-- name: string (nullable = true)\n",
      " |-- category: string (nullable = true)\n",
      " |-- quantity: integer (nullable = true)\n",
      " |-- price: double (nullable = true)\n",
      "\n",
      "+---+--------------------+---------------+--------+------+\n",
      "| id|                name|       category|quantity| price|\n",
      "+---+--------------------+---------------+--------+------+\n",
      "|  1|           iPhone 12|    Electronics|      10|899.99|\n",
      "|  2|     Nike Air Max 90|       Clothing|      25|119.99|\n",
      "|  3|KitchenAid Stand ...|Home Appliances|       5|299.99|\n",
      "|  4|    The Great Gatsby|          Books|      50| 12.99|\n",
      "|  5|L'Oreal Paris Mas...|         Beauty|     100|  9.99|\n",
      "+---+--------------------+---------------+--------+------+\n",
      "only showing top 5 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Display schema of DataFrame\n",
    "df.printSchema()\n",
    "\n",
    "# Display content of DataFrame\n",
    "df.show(5)"
   ]
  },
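  {
   "cell_type": "markdown",
   "id": "2c3d4e5f-1a2b-4c6d-8e7f-3a4b5c6d7e8f",
   "metadata": {},
   "source": [
    "The same schema can also be expressed as a DDL-formatted string, which `spark.read.csv` accepts in place of a `StructType`. A minimal equivalent sketch:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4d5e6f7a-2b3c-4d7e-9f8a-4b5c6d7e8f9a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Equivalent schema as a DDL-formatted string\n",
    "ddl_schema = \"id INT, name STRING, category STRING, quantity INT, price DOUBLE\"\n",
    "df_ddl = spark.read.csv(csv_file_path, header=True, schema=ddl_schema)\n",
    "df_ddl.printSchema()"
   ]
  },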
  {
   "cell_type": "markdown",
   "id": "1f099209-1c77-4c2b-8142-944ad92d4723",
   "metadata": {},
   "source": [
    "#### Read CSV with inferSchema"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "13f37a98-6810-43f5-8229-267d6528cea5",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                                                \r"
     ]
    }
   ],
   "source": [
    "# Read CSV file into DataFrame with inferSchema\n",
    "csv_file_path = \"hdfs://namenode:9000/input_files/products.csv\"\n",
    "df = spark.read.csv(csv_file_path, header=True, inferSchema=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "c53d37bd-6bc6-4eaf-b0c0-879d3302df8d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- id: integer (nullable = true)\n",
      " |-- name: string (nullable = true)\n",
      " |-- category: string (nullable = true)\n",
      " |-- quantity: integer (nullable = true)\n",
      " |-- price: double (nullable = true)\n",
      "\n",
      "+---+--------------------+---------------+--------+------+\n",
      "| id|                name|       category|quantity| price|\n",
      "+---+--------------------+---------------+--------+------+\n",
      "|  1|           iPhone 12|    Electronics|      10|899.99|\n",
      "|  2|     Nike Air Max 90|       Clothing|      25|119.99|\n",
      "|  3|KitchenAid Stand ...|Home Appliances|       5|299.99|\n",
      "|  4|    The Great Gatsby|          Books|      50| 12.99|\n",
      "|  5|L'Oreal Paris Mas...|         Beauty|     100|  9.99|\n",
      "+---+--------------------+---------------+--------+------+\n",
      "only showing top 5 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Display schema of DataFrame\n",
    "df.printSchema()\n",
    "\n",
    "# Display content of DataFrame\n",
    "df.show(5)"
   ]
  },
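  {
   "cell_type": "markdown",
   "id": "5e6f7a8b-3c4d-4e8f-a09b-5c6d7e8f9a0b",
   "metadata": {},
   "source": [
    "Note that `inferSchema` makes Spark scan the data to determine types, which costs an extra pass over the file. The same read can also be written with the generic `format`/`option`/`load` builder, which is convenient when options are assembled programmatically; a sketch equivalent to the call above:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6f7a8b9c-4d5e-4f9a-b1c0-6d7e8f9a0b1c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Equivalent read using the generic DataFrameReader builder\n",
    "df_alt = (spark.read\n",
    "    .format(\"csv\")\n",
    "    .option(\"header\", \"true\")\n",
    "    .option(\"inferSchema\", \"true\")\n",
    "    .load(csv_file_path))\n",
    "\n",
    "df_alt.printSchema()"
   ]
  },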
  {
   "cell_type": "markdown",
   "id": "16cd7777-ced0-4f55-9825-f6a868636d47",
   "metadata": {},
   "source": [
    "### Read JSON file into DataFrame"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4d339da0-ea17-474b-82e8-54785bc2ecf3",
   "metadata": {},
   "source": [
    "#### Single Line JSON"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "d3fb6fdc-da18-4edd-a815-3152cfc2dcfd",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{\"id\":1,\"name\":\"iPhone 12\",\"category\":\"Electronics\",\"quantity\":10,\"price\":899.99}\n",
      "{\"id\":2,\"name\":\"Nike Air Max 90\",\"category\":\"Clothing\",\"quantity\":25,\"price\":119.99}\n",
      "{\"id\":3,\"name\":\"KitchenAid Stand Mixer\",\"category\":\"Home Appliances\",\"quantity\":5,\"price\":299.99}\n",
      "{\"id\":4,\"name\":\"The Great Gatsby\",\"category\":\"Books\",\"quantity\":50,\"price\":12.99}\n",
      "{\"id\":5,\"name\":\"L'Oreal Paris Mascara\",\"category\":\"Beauty\",\"quantity\":100,\"price\":9.99}\n",
      "{\"id\":6,\"name\":\"Yoga Mat\",\"category\":\"Sports\",\"quantity\":30,\"price\":29.99}\n",
      "{\"id\":7,\"name\":\"Samsung 4K Smart TV\",\"category\":\"Electronics\",\"quantity\":8,\"price\":799.99}\n",
      "{\"id\":8,\"name\":\"Levi's Jeans\",\"category\":\"Clothing\",\"quantity\":15,\"price\":49.99}\n",
      "{\"id\":9,\"name\":\"Dyson Vacuum Cleaner\",\"category\":\"Home Appliances\",\"quantity\":3,\"price\":399.99}\n",
      "{\"id\":10,\"name\":\"Harry Potter Series\",\"category\":\"Books\",\"quantity\":20,\"price\":15.99}\n"
     ]
    }
   ],
   "source": [
    "%%bash\n",
    "head -10 ../input_files/products_singleline.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "6cfe5f2b-7720-403b-a8bf-aec931bca199",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read single line JSON\n",
    "# Each row is a JSON record, records are separated by new line\n",
    "json_file_path = \"hdfs://namenode:9000/input_files/products_singleline.json\"\n",
    "df = spark.read.json(json_file_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "0cc72e44-36ca-44cb-8d31-36f473e52f9a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- category: string (nullable = true)\n",
      " |-- id: long (nullable = true)\n",
      " |-- name: string (nullable = true)\n",
      " |-- price: double (nullable = true)\n",
      " |-- quantity: long (nullable = true)\n",
      "\n",
      "+---------------+---+--------------------+------+--------+\n",
      "|       category| id|                name| price|quantity|\n",
      "+---------------+---+--------------------+------+--------+\n",
      "|    Electronics|  1|           iPhone 12|899.99|      10|\n",
      "|       Clothing|  2|     Nike Air Max 90|119.99|      25|\n",
      "|Home Appliances|  3|KitchenAid Stand ...|299.99|       5|\n",
      "|          Books|  4|    The Great Gatsby| 12.99|      50|\n",
      "|         Beauty|  5|L'Oreal Paris Mas...|  9.99|     100|\n",
      "+---------------+---+--------------------+------+--------+\n",
      "only showing top 5 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Display schema of DataFrame\n",
    "df.printSchema()\n",
    "\n",
    "# Display content of DataFrame\n",
    "df.show(5)"
   ]
  },
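  {
   "cell_type": "markdown",
   "id": "7a8b9c0d-5e6f-4a0b-c2d1-7e8f9a0b1c2d",
   "metadata": {},
   "source": [
    "An explicit schema can be passed to `spark.read.json` as well, which skips inference and keeps the column order fixed. A minimal sketch reusing the `schema` defined earlier:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8b9c0d1e-6f7a-4b1c-d3e2-8f9a0b1c2d3e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reuse the StructType defined earlier to avoid schema inference\n",
    "df_json = spark.read.json(json_file_path, schema=schema)\n",
    "df_json.printSchema()"
   ]
  },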
  {
   "cell_type": "markdown",
   "id": "6f7db26b-be18-4602-9cd8-780fc82294f6",
   "metadata": {},
   "source": [
    "#### Multi-lines JSON"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "c76efef7-a884-464e-8132-dabfde493dfb",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[\n",
      "  {\n",
      "    \"id\": 1,\n",
      "    \"name\": \"iPhone 12\",\n",
      "    \"category\": \"Electronics\",\n",
      "    \"quantity\": 10,\n",
      "    \"price\": 899.99\n",
      "  },\n",
      "  {\n",
      "    \"id\": 2,\n",
      "    \"name\": \"Nike Air Max 90\",\n",
      "    \"category\": \"Clothing\",\n",
      "    \"quantity\": 25,\n",
      "    \"price\": 119.99\n",
      "  },\n",
      "  {\n",
      "    \"id\": 3,\n",
      "    \"name\": \"KitchenAid Stand Mixer\",\n",
      "    \"category\": \"Home Appliances\",\n",
      "    \"quantity\": 5,\n"
     ]
    }
   ],
   "source": [
    "%%bash\n",
    "head -20 ../input_files/products_multiline.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "89755543-83fd-4f85-8829-e34ec16e9dec",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read multi-line JSON\n",
    "# JSON is an array of record, records are separated by a comma.\n",
    "# each record is defined in multiple lines\n",
    "json_file_path = \"hdfs://namenode:9000/input_files/products_multiline.json\"\n",
    "df = spark.read.json(json_file_path, multiLine=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "c0aa892a-6784-424f-b40a-7469035cd891",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- category: string (nullable = true)\n",
      " |-- id: long (nullable = true)\n",
      " |-- name: string (nullable = true)\n",
      " |-- price: double (nullable = true)\n",
      " |-- quantity: long (nullable = true)\n",
      "\n",
      "+---------------+---+--------------------+------+--------+\n",
      "|       category| id|                name| price|quantity|\n",
      "+---------------+---+--------------------+------+--------+\n",
      "|    Electronics|  1|           iPhone 12|899.99|      10|\n",
      "|       Clothing|  2|     Nike Air Max 90|119.99|      25|\n",
      "|Home Appliances|  3|KitchenAid Stand ...|299.99|       5|\n",
      "|          Books|  4|    The Great Gatsby| 12.99|      50|\n",
      "|         Beauty|  5|L'Oreal Paris Mas...|  9.99|     100|\n",
      "+---------------+---+--------------------+------+--------+\n",
      "only showing top 5 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Display schema of DataFrame\n",
    "df.printSchema()\n",
    "\n",
    "# Display content of DataFrame\n",
    "df.show(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c0e3b6a8-6273-407c-b523-3b6a9b795d73",
   "metadata": {},
   "source": [
    "### Read parquet file into DataFrame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "98025d25-3cd5-4ee4-9218-60d9b206cee0",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = spark.read.parquet(parquet_file_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "394fea18-891c-4422-b484-bfb8cadb4b51",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- category: string (nullable = true)\n",
      " |-- id: long (nullable = true)\n",
      " |-- name: string (nullable = true)\n",
      " |-- price: double (nullable = true)\n",
      " |-- quantity: long (nullable = true)\n",
      "\n",
      "+---------------+---+--------------------+------+--------+\n",
      "|       category| id|                name| price|quantity|\n",
      "+---------------+---+--------------------+------+--------+\n",
      "|    Electronics|  1|           iPhone 12|899.99|      10|\n",
      "|       Clothing|  2|     Nike Air Max 90|119.99|      25|\n",
      "|Home Appliances|  3|KitchenAid Stand ...|299.99|       5|\n",
      "|          Books|  4|    The Great Gatsby| 12.99|      50|\n",
      "|         Beauty|  5|L'Oreal Paris Mas...|  9.99|     100|\n",
      "+---------------+---+--------------------+------+--------+\n",
      "only showing top 5 rows\n",
      "\n"
     ]
     }
   ],
   "source": [
    "# Display schema of DataFrame\n",
    "df.printSchema()\n",
    "\n",
    "# Display content of DataFrame\n",
    "df.show(5)"
   ]
  },
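  {
   "cell_type": "markdown",
   "id": "9c0d1e2f-7a8b-4c2d-e4f3-9a0b1c2d3e4f",
   "metadata": {},
   "source": [
    "Parquet files embed their schema, so no schema definition or inference is needed on read. Writing a DataFrame back out is symmetric; a minimal sketch (the output path below is illustrative, not from the original notebook):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0d1e2f3a-8b9c-4d3e-f5a4-0b1c2d3e4f5a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the DataFrame back to HDFS as Parquet (path is illustrative)\n",
    "output_path = \"hdfs://namenode:9000/output_files/products_copy.parquet\"\n",
    "df.write.mode(\"overwrite\").parquet(output_path)"
   ]
  },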
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "27689ca9-304b-40b0-970d-7459545b4983",
   "metadata": {},
   "outputs": [],
   "source": [
    "spark.stop()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "pyspark",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
